# Purpose of the code:
# Use the normalized values and look for patterns of change in feature values
# Plot change of values for a feature in a graph for all patients
# Plot change of values for a feature in a separate graph for each patient
# necessary imports
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import math as math
from scipy import stats
%matplotlib inline
# switch to a proper directory to access the data
pwd
cd /camdatalake/bronze/verily_ms/device/
pwd
# Read the data.
# FeatureDay: average value of the features for each day of study. Days are
#   listed as DayOfStudy.
# FeatureStudy: features for the entire study period. For the at-home features,
#   the reported value is the median of the observed day-level values.
import csv
import gzip


def _read_gzipped_csv(path):
    """Open the gzip-compressed CSV at ``path`` in text mode and parse it.

    The first row of the file is used as the header.
    """
    with gzip.open(path, "rt", newline="") as handle:
        return pd.read_csv(handle, header=0)


FeatureDay = _read_gzipped_csv("FeaturesDay.csv.gz")
FeatureStudy = _read_gzipped_csv("FeaturesStudy.csv.gz")
# Explore the dataset: column summary, descriptive statistics, first rows.
FeatureDay.info()
FeatureDay.describe()
FeatureDay.head()
# List every column (feature) available in the dataset.
FeatureDay.columns.tolist()
# Build the list of unique patient IDs.
patient_IDs = list(pd.unique(FeatureDay['gls_subject_code']))
patient_IDs
# The 10 free-living features that are highly correlated with the MSFC scores.
# We want to see how each of them changes across the days of study.
free_living_features_highly_correlated = [
    'idle_minutes',
    'turn_vel_std_ankle',
    'swing',
    'stance',
    'duration_movement_count',
    'turn_vel_max_ankle',
    'turn_duration_ankle',
    'duration_rem_count',
    'rem_percent',
    'movement_rate',
]
# The 19 at-home (structured activity) features that are highly correlated
# with the MSFC scores.
at_home_features_highly_correlated = [
    'mean_pvt_delay_7_at_home',
    'mobility_stance_at_home',
    'mean_pvt_delay_at_home',
    'pq_nondominant_rhythm_at_home',
    'pq_nondominant_median_at_home',
    'pq_dominant_rhythm_at_home',
    'turn_vel_max_at_home',
    'mobility_swing_at_home',
    'zx_dominant_num_correct_at_home',
    'turn_vel_std_at_home',
    'turn_duration_ankle_at_home',
    'turn_vel_max_ankle_at_home',
    'mean_pvt_delay_5_at_home',
    'zx_nondominant_median_at_home',
    'zx_nondominant_num_correct_at_home',
    'mean_pvt_delay_3_at_home',
    'turn_vel_std_ankle_at_home',
    'mobility_activity_at_home_time',
    'mean_pvt_delay_1_at_home',
]
# Normalize every feature column: subtract the column mean and divide by the
# column range (max - min). The three identifier columns are set aside first
# and put back in front of the normalized features afterwards.
id_columns = ['user_email', 'gls_subject_code', 'dayofstudy']
df = FeatureDay.drop(id_columns, axis=1)
column_range = df.max() - df.min()
FeatureDay_norm = (df - df.mean()) / column_range
FeatureDay_norm = pd.concat([FeatureDay[id_columns], FeatureDay_norm], axis=1)
FeatureDay_norm.head()
def remove_outliers(feature_values, day_of_study, m=1.5):
    """Drop observations lying more than ``m`` standard deviations from the mean.

    ``feature_values`` and ``day_of_study`` are paired by position; a pair is
    kept only when its feature value ``v`` satisfies
    ``mean - m * std <= v <= mean + m * std``, where mean and std are computed
    over ``feature_values``. NaN feature values never satisfy the comparison
    and are therefore dropped together with their day.

    Parameters
    ----------
    feature_values : pandas.Series (or sequence convertible to one)
        Observed values of a single feature.
    day_of_study : pandas.Series (or sequence convertible to one)
        Day associated with each value, in the same order.
    m : float, optional
        Distance threshold from the mean, in standard deviations (default 1.5).

    Returns
    -------
    (pandas.Series, pandas.Series)
        Filtered feature values and their associated days, both re-indexed
        from 0. When no observation survives, BOTH series are empty, so the
        two outputs always have the same length (previously the unfiltered
        days were returned next to an empty value series).
    """
    # Work on positional (0..n-1) indexes so the two inputs pair up by position
    # regardless of the index labels they arrive with.
    values = pd.Series(feature_values).reset_index(drop=True)
    days = pd.Series(day_of_study).reset_index(drop=True)

    # Statistics are taken over all supplied feature values.
    mean = values.mean()
    std = values.std()

    # Like zip(): only pair up as many items as the shorter input has.
    n_pairs = min(len(values), len(days))
    values = values.iloc[:n_pairs]
    days = days.iloc[:n_pairs]

    # Vectorized filter; comparisons against NaN are False, so NaN values
    # (and the case of an undefined std) yield no kept rows.
    keep = (values >= mean - m * std) & (values <= mean + m * std)
    return values[keep].reset_index(drop=True), days[keep].reset_index(drop=True)
def standardize_axis(feature):
    """Return common axis maxima for ``feature`` across all patients.

    For every ID in the module-level ``patient_IDs`` the rows of
    ``FeatureDay_norm`` belonging to that patient are selected, outliers are
    removed with ``remove_outliers`` (feature values together with their days
    of study), and the surviving values are pooled.

    Parameters
    ----------
    feature : str
        Name of the feature column in ``FeatureDay_norm``.

    Returns
    -------
    (max_y, max_x)
        Largest pooled feature value and largest pooled day of study.

    Raises
    ------
    ValueError
        If nothing is left to take a maximum of (the original code failed with
        NumPy's less descriptive ValueError in the same situation).
    """
    day_column = 'dayofstudy'
    pooled_values = []
    pooled_days = []

    for ID in patient_IDs:
        patient_rows = FeatureDay_norm[FeatureDay_norm['gls_subject_code'] == ID]
        # No sorting here: the removed DataFrame.sort call is not needed,
        # because neither the outlier filter nor the maxima depend on row order.
        y, x = remove_outliers(patient_rows[feature], patient_rows[day_column])
        # extend() appends in place instead of rebuilding the list every pass.
        pooled_values.extend(y.tolist())
        pooled_days.extend(x.tolist())

    if not pooled_values or not pooled_days:
        raise ValueError(
            "no values left for feature %r after outlier removal" % (feature,))

    # The axis ranges are the maxima of the pooled, filtered values.
    return np.max(pooled_values), np.max(pooled_days)
def plot_feature_across_days_all_patients_in_one(feature):
    """Plot ``feature`` against day of study for all patients in one graph.

    One line is drawn per patient (labelled with the patient ID) from the
    outlier-filtered rows of the module-level ``FeatureDay_norm``; patients
    with no values left after filtering are skipped. Relies on the
    module-level ``patient_IDs``, ``standardize_axis`` and ``remove_outliers``.

    Parameters
    ----------
    feature : str
        Name of the feature column in ``FeatureDay_norm``.

    Side effects
    ------------
    Prints the feature name, creates a new 20x10 figure, and changes the
    global matplotlib font settings (size 50), which stay in effect for
    figures created afterwards.
    """
    print(feature)
    plt.figure(figsize=(20, 10))

    # Font settings are global rc state, so set them once instead of once per
    # patient. 'normal' is not a font family (matplotlib warns and falls back
    # to its default); 'sans-serif' is used here to avoid that warning.
    font = {'family': 'sans-serif',
            'size': 50}
    plt.rc('font', **font)

    # The axis range is the same for every patient: compute it once, outside
    # the loop. Only the x maximum is used; y is fixed to [-1, 1].
    _, max_x = standardize_axis(feature)
    plt.xlabel('Days of Study')
    plt.ylabel(feature)
    plt.xlim(0, max_x)
    plt.ylim(-1, 1)

    day_column = 'dayofstudy'
    plotted_any = False
    for ID in patient_IDs:
        # rows of this patient, ordered by day so the line runs left to right
        rows = FeatureDay_norm[FeatureDay_norm['gls_subject_code'] == ID][[feature, day_column]]
        rows = rows.sort_values(day_column)

        # remove outliers (feature values & associated days of study);
        # a patient whose values are all missing comes back empty as well
        y, x = remove_outliers(rows[feature], rows[day_column])
        if len(y) == 0:
            continue

        plt.plot(x, y, label=ID)
        plotted_any = True

    # Only request a legend when at least one labelled line exists.
    if plotted_any:
        plt.legend(loc=3)
def plot_feature_across_days(feature):
    """Plot ``feature`` against day of study, one subplot per patient.

    Subplots are laid out five per row, in the order of the module-level
    ``patient_IDs``. Each subplot shows the outlier-filtered values of that
    patient taken from ``FeatureDay_norm``; a patient with no values left
    after filtering gets an empty, but still labelled, subplot. Relies on the
    module-level ``standardize_axis`` and ``remove_outliers``.

    Parameters
    ----------
    feature : str
        Name of the feature column in ``FeatureDay_norm``.

    Side effects
    ------------
    Prints the feature name and creates a new figure.
    """
    n_cols = 5
    # At least the original 5x5 grid (25 patients); extra rows are added when
    # there are more patients so that indexing the axes cannot overflow.
    n_rows = max(5, -(-len(patient_IDs) // n_cols))
    figs, axes = plt.subplots(nrows=n_rows, ncols=n_cols,
                              figsize=(20, 4 * n_rows), dpi=200)
    print(feature)

    # The axis range is shared by all subplots: compute it once, outside the
    # loop. Only the x maximum is used; y is fixed to [-1, 1].
    _, max_x = standardize_axis(feature)

    day_column = 'dayofstudy'
    for idx, ID in enumerate(patient_IDs):
        # rows of this patient, ordered by day so the line runs left to right
        rows = FeatureDay_norm[FeatureDay_norm['gls_subject_code'] == ID][[feature, day_column]]
        rows = rows.sort_values(day_column)

        ax = axes[idx // n_cols, idx % n_cols]
        ax.set_xlim(0, max_x)
        ax.set_ylim(-1, 1)
        ax.set_title(ID, y=0.9)
        ax.set_xlabel('Days of Study')
        ax.set_ylabel(feature)

        # remove outliers (feature values & associated days of study);
        # a patient whose values are all missing comes back empty as well
        y, x = remove_outliers(rows[feature], rows[day_column])
        if len(y) > 0:
            ax.plot(x, y)
# Plot every free-living feature: first all patients in one graph, then one
# subplot per patient.
for feature in free_living_features_highly_correlated:
    plot_feature_across_days_all_patients_in_one(feature)
    plot_feature_across_days(feature)

# Plot the at-home features (structured activity) the same way. The font
# settings are re-applied after the first six features, at the same point in
# the sequence as before.
for feature in at_home_features_highly_correlated[:6]:
    plot_feature_across_days_all_patients_in_one(feature)
    plot_feature_across_days(feature)

# Enlarge the font size. 'normal' is not a font family (matplotlib warns and
# falls back to its default), so 'sans-serif' is used for the family.
import matplotlib
font = {'family': 'sans-serif',
        'weight': 'normal',
        'size': 50}
matplotlib.rc('font', **font)

for feature in at_home_features_highly_correlated[6:]:
    plot_feature_across_days_all_patients_in_one(feature)
    plot_feature_across_days(feature)